library(tidyverse)
library(knitr)
library(plotly) ; library(viridis) ; library(gridExtra) ; library(RColorBrewer) ; library(ggpubr)
library(biomaRt)
library(polycor)
library(caret) ; library(ROCR) ; library(car) ; library(MLmetrics)
library(corrplot)
library(expss) ; library(knitr) ; library(kableExtra)
library(foreach) ; library(doParallel)

# Map SFARI gene-score positions to their display colours.
# r: integer vector of palette positions (1-10); positions outside that range
#    yield NA, matching ordinary R subscripting.
# Returns a character vector of colours of length(r).
# (The original relied on the invisible value of an internal assignment as the
# return value; this version returns the subset explicitly.)
SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9',
          '#b3b3b3','#808080','gray','#d9d9d9')
  pal[r]
}


Load Dataset


# Select which clustering to use and attach its module assignments
clustering_selected = 'DynamicHybrid'
clusterings = read_csv('./../Data/clusters.csv')
# Extract the selected clustering column as a plain vector ([[ ]] replaces the
# original data.frame %>% unlist %>% unname chain, same result)
clusterings$Module = clusterings[[clustering_selected]]
assigned_module = clusterings %>% dplyr::select(ID, Module)

# Dataset created with 20_04_07_create_dataset.html
dataset = read.csv(paste0('./../Data/dataset_', clustering_selected, '.csv'), row.names=1)
# mutate() drops rownames, so save and restore them around the pipeline
# NOTE(review): assumes clusterings rows are in the same order as dataset rows — confirm
rownames_dataset = rownames(dataset)
dataset = dataset %>% mutate(Module = clusterings$Module, gene.score = as.character(gene.score)) %>%
                      mutate(gene.score = ifelse(is.na(gene.score), 'None', gene.score)) %>%
          dplyr::select(-matches(clustering_selected))
rownames(dataset) = rownames_dataset

# Fix any Gene Significance that is NA
# Fix any Gene Significance that is NA: recompute it as the polyserial
# correlation between the gene's expression and Diagnosis
GS_missing = rownames(dataset)[is.na(dataset$GS)]
if(length(GS_missing)>0){
  # Gandal dataset — provides datExpr (genes x samples) and datMeta used below
  load('./../Data/preprocessed_data.RData')
  datExpr = datExpr %>% data.frame
  
  for(g in GS_missing){
    # Genes are rows of datExpr; correlate this gene's expression with Diagnosis
    dataset$GS[rownames(dataset) == g] = polyserial(as.numeric(datExpr[g,]), datMeta$Diagnosis)
  }
}



# SFARI Genes: keep one row per Ensembl ID, dropping duplicated and missing IDs
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]


# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
# Genes with at least one GO term matching 'neuron' are flagged as Neuronal;
# distinct(ID) keeps only the ID column, one row per gene
GO_neuronal = GO_annotations %>%
              mutate(ID = as.character(ensembl_gene_id)) %>%
              filter(grepl('neuron', go_term)) %>%
              distinct(ID) %>%
              mutate(Neuronal = 1)


# Add gene symbol: map Ensembl IDs to external gene symbols via biomaRt,
# using the Feb 2014 archive to match the annotation used upstream
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(dataset), mart=mart)


# NOTE(review): g only exists if the GS-imputation loop above actually ran;
# rm() warns (does not error) when an object is absent
rm(getinfo, mart, rownames_dataset, GO_annotations, g, GS_missing)


The features that will be considered for the classification model will be the ones WGCNA uses to identify significant modules and genes:



Filtering out the 138 genes that were not assigned to any cluster (represented by the gray cluster)

# Genes with no Module-Trait correlation are the unclassified (gray) ones
rm_cluster = dataset[is.na(dataset$MTcor),'Module'] %>% unique %>% as.character

# Drop unclassified genes, p-value and binary-module columns, and replace the
# SFARI score with a binary label; add the absolute Gene Significance
new_dataset = dataset %>% filter(Module != 'gray' & !is.na(MTcor)) %>% 
              dplyr::select(-c(matches('pval|Module'), MMgray)) %>%
              mutate('absGS'=abs(GS), 'SFARI'=gene.score!='None') %>% dplyr::select(-gene.score)
# Use exactly the same condition as the filter above so the rowname vector is
# guaranteed to match new_dataset (the original only checked Module != 'gray')
rownames(new_dataset) = rownames(dataset)[dataset$Module != 'gray' & !is.na(dataset$MTcor)]

rm(rm_cluster)


Summary of the changes made to the original WGCNA variables:


  • Using Module Membership variables instead of binary module membership

  • Including a new variable with the absolute value of GS

  • Removing genes assigned to the gray module (unclassified genes)

  • Adding the Objective variable: Binary label indicating if it’s in the SFARI dataset or not

# Preview: first 5 genes transposed so each feature becomes a row of the table
table_info = new_dataset %>% head(5) %>% t 

table_info %>% kable(caption = '(Transposed) features and their values for the first rows of dataset', 
                     col.names = colnames(table_info)) %>% kable_styling(full_width = F)
(Transposed) features and their values for the first rows of dataset
ENSG00000000003 ENSG00000000419 ENSG00000000457 ENSG00000000460 ENSG00000000938
MTcor 0.5650633 0.2024116 -0.8612290 0.1122683 0.9612374
GS 0.3856082 0.0330853 -0.3937957 -0.2418481 0.4196586
MM.00BE6E 0.0922553 0.0349765 -0.0890836 0.2487132 0.0451893
MM.77AF00 0.2199591 0.2824804 -0.2112288 0.0598775 0.2502654
MM.E98428 0.1328112 0.5224834 -0.0918756 0.3717968 -0.1231391
MM.A08CFF 0.2024525 0.0815135 -0.2001277 -0.1452052 0.2712711
MM.00C0B3 0.0815588 0.2369206 -0.1110010 0.1260077 0.2460671
MM.00C1A7 0.2815932 0.0702776 -0.2809204 -0.0886543 0.3920586
MM.4BB400 -0.2319119 -0.0459663 0.2304153 0.2787641 -0.1447272
MM.00ABFD -0.1179976 0.0221085 0.0641125 0.1968199 0.1703302
MM.C39A00 -0.1073985 0.0985058 0.0051635 0.2543768 0.0234781
MM.44A0FF 0.0725256 0.0210242 -0.1861562 -0.0363138 0.2709805
MM.96A900 -0.1184551 -0.2201542 0.0072814 -0.0657402 0.1906476
MM.F47B5B -0.0698187 0.3021432 -0.0195570 0.2424089 -0.0914334
MM.00BF7D 0.0052768 0.1737149 -0.0730145 0.0143253 -0.1362776
MM.EF67EB 0.1979762 -0.0057964 -0.1415817 -0.1884690 0.0441187
MM.FB61D8 0.2532797 0.2596606 -0.1685126 0.1603773 -0.0682160
MM.FF66A9 0.3289569 -0.0777291 -0.3149048 -0.3983808 0.2569934
MM.00B0F6 0.0938223 -0.0335679 -0.2462794 0.0789581 0.0741229
MM.64B200 -0.0556433 0.0215751 -0.0760139 -0.0474915 0.3149536
MM.00C19A 0.1278939 -0.0238796 -0.3257874 -0.2659576 0.1003769
MM.6C9AFF 0.1875213 0.2571035 -0.3189170 -0.2042838 0.2097051
MM.88AC00 0.3083110 0.0657020 -0.3673810 -0.2789542 0.3482581
MM.FE61CD 0.2735454 -0.0749574 -0.2660101 -0.4092106 0.2179956
MM.A3A500 0.4885859 -0.1598076 -0.1265973 -0.2103806 0.1708103
MM.B99E00 0.3672758 -0.0798395 -0.1843762 -0.1208432 0.3755777
MM.C37EFF 0.4850096 0.1223949 -0.2840792 -0.2339267 0.3913096
MM.FB727D 0.4281761 -0.0335896 -0.2207502 -0.3633790 0.2932789
MM.B385FF 0.1743634 0.0705221 -0.1387095 -0.0326525 0.3436674
MM.D49200 0.2305061 0.2920983 -0.1482150 0.0619138 0.0675482
MM.CC9600 -0.1114272 -0.4158161 0.2247263 -0.0614412 -0.0086720
MM.DD71FA -0.0120376 -0.3683190 0.1526563 -0.1847912 -0.0511214
MM.FF699B 0.1832787 -0.2059584 -0.2345238 -0.3807072 0.1870144
MM.AEA200 0.0364279 -0.2351406 0.0246775 -0.2377677 -0.0114991
MM.F663E2 -0.0396559 -0.3546137 0.0628408 -0.2346651 -0.0989864
MM.00BDD4 0.0199877 -0.0136117 -0.0038863 -0.0649262 -0.2838784
MM.00BD5C 0.0140797 -0.0814519 0.2181220 0.1622569 -0.1994018
MM.E28900 -0.0589930 0.0368712 0.1610854 -0.0392749 -0.2253103
MM.FF63B6 -0.2520685 -0.0531543 0.3478394 0.0626461 -0.4204959
MM.00C08C 0.1614500 -0.1082819 -0.1543919 -0.1680506 0.1756669
MM.FE6D8D -0.0025549 -0.0425405 0.0111406 -0.0212189 -0.1031666
MM.1FB700 -0.3351864 -0.0465819 0.1693029 -0.0357561 0.0445248
MM.00B7E7 -0.0786113 0.0071915 0.0762654 0.0876867 -0.1733791
MM.00A6FF -0.1739419 -0.2078984 0.1180385 0.1078684 -0.0914032
MM.00C0BF -0.1973047 -0.1922326 0.1299675 -0.0144399 -0.2454681
MM.00BECA -0.2855911 0.0260396 0.3012228 0.2956419 -0.1086428
MM.00B92F -0.2259570 0.1128775 0.1980964 0.2720306 -0.1161659
MM.00BB48 -0.1229260 0.1052040 0.1167934 0.2665806 0.0580013
MM.00B4EF -0.1745230 -0.2022552 0.3304780 0.1717256 -0.2292696
MM.00BADE -0.2710436 -0.0933350 0.3498405 0.1704545 -0.4001771
MM.D177FF -0.3147175 -0.2562477 0.3499766 -0.0150251 -0.3092823
MM.8993FF -0.1568785 -0.3659637 0.2652017 0.0403313 -0.0740675
MM.E76BF3 -0.1639996 -0.3652531 0.2458832 0.1317597 -0.1253242
MM.DB8E00 -0.1024531 0.3704257 0.1675426 0.4143045 -0.2610028
MM.EE8045 -0.3689161 0.1718650 0.4312384 0.3901047 -0.4141214
MM.F8766D -0.2679269 0.2957557 0.2873824 0.3340027 -0.4458381
MM.FF61C2 -0.1621256 0.3632301 0.2470977 0.4848771 -0.4595864
absGS 0.3856082 0.0330853 0.3937957 0.2418481 0.4196586
SFARI 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
rm(table_info)
# Keep the unfiltered dataset (still has gene.score and Module) for later
# joins, and continue working with the filtered feature set
original_dataset = dataset
dataset = new_dataset

rm(new_dataset)

The final dataset contains 15994 observations (genes) and 59 variables



Exploratory Analysis


PCA of Variables


The Module Membership variables are grouped by Module-Trait correlation, with positive correlations on one side, negative on the other, and both SFARI and absGS are in the middle of both groups

# PCA over variables: transpose so each column (feature) becomes an observation
pca = dataset %>% mutate(SFARI = as.numeric(SFARI)) %>% t %>% prcomp

plot_data = data.frame('ID'=colnames(dataset), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2],
                       type=ifelse(grepl('MM', colnames(dataset)),'ModMembership',
                            ifelse(grepl('SFARI', colnames(dataset)), 'SFARI',
                            ifelse(grepl('GS', colnames(dataset)), 'GS', 'MTcor'))))

# Module-Trait correlation per module, used to colour the MM variables
mtcor_by_module = original_dataset %>% dplyr::select(Module, MTcor) %>% unique
colnames(mtcor_by_module) = c('ID','MTcor')

# Modules are named by hex colour ('#xxxxxx'); MM columns are named 'MM.xxxxxx'
plot_data = mtcor_by_module %>% mutate(ID = gsub('#','MM.',ID)) %>% right_join(plot_data, by='ID')

ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=MTcor)) + geom_point(aes(id=ID)) +
         xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
         ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
         scale_colour_distiller(palette = 'RdBu', na.value = 'darkgrey') + theme_minimal() +
         ggtitle('PCA of variables coloured by Module-Diagnosis correlation'))
rm(mtcor_by_module, pca)


PCA of Samples


  • The two main patterns that seem to characterise the genes are their Gene Significance and the Module-Diagnosis correlation of their corresponding module

  • Mean Expression doesn’t seem to play an important role

  • SFARI Genes seem to be evenly distributed everywhere

  • It’s not clear what the 2nd principal component is capturing

# Mean Expression data
# NOTE(review): the .RData is assumed to also provide datGenes, datMeta, dds
# and DE_info removed at the bottom — confirm
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
mean_expr = data.frame('ID'=rownames(datExpr), 'meanExpr' = rowMeans(datExpr))

# PCA of samples (genes): after the transpose, genes live in pca$rotation
pca = dataset %>% t %>% prcomp

# Axis label with the % of variance explained by the i-th PC
# (replaces eight copies of the same paste0/round expression)
pc_label = function(i) paste0('PC', i, ' (', round(100*summary(pca)$importance[2,i]), '%)')

plot_data = data.frame('ID'=rownames(dataset), 'PC1'=pca$rotation[,1], 'PC2'=pca$rotation[,2], 
                       'SFARI'=dataset$SFARI, 'MTcor'=dataset$MTcor, 'GS'=dataset$GS) %>%
            mutate(alpha=ifelse(SFARI, 0.7, 0.2)) %>% left_join(mean_expr, by='ID')

p1 = plot_data %>% ggplot(aes(PC1, PC2, color=MTcor)) + geom_point(alpha=0.4) + scale_color_viridis() + 
     theme_minimal() + ggtitle('Genes coloured by Module-Diagnosis correlation') +
     xlab(pc_label(1)) + ylab(pc_label(2)) +
     theme(legend.position='bottom')

p2 = plot_data %>% ggplot(aes(PC1, PC2, color=GS)) + geom_point(alpha=0.4) + scale_color_viridis() + 
     xlab(pc_label(1)) + ylab(pc_label(2)) +
     theme_minimal() + ggtitle('Genes coloured by Gene Significance') + theme(legend.position='bottom')

# SFARI genes drawn more opaque (alpha 0.7 vs 0.2) with marginal densities
p3 = plot_data %>% ggplot(aes(PC1, PC2, color=SFARI)) + geom_point(alpha = plot_data$alpha) +
     xlab(pc_label(1)) + ylab(pc_label(2)) +
     theme_minimal() + ggtitle('Genes coloured by SFARI label') + theme(legend.position='bottom')
p3 = ggExtra::ggMarginal(p3, type='density', groupColour=TRUE, size=10)

p4 = plot_data %>% ggplot(aes(PC1, PC2, color=meanExpr)) + geom_point(alpha=0.4) + scale_color_viridis() + 
     xlab(pc_label(1)) + ylab(pc_label(2)) +
     theme_minimal() + ggtitle('Genes coloured by mean level of expression') + theme(legend.position='bottom')

grid.arrange(p1, p2, p3, p4, nrow=2)

rm(pca, pc_label, datExpr, datGenes, datMeta, dds, DE_info, mean_expr, p1, p2, p3, p4)



Dividing samples into Training and Test Sets


4.9% of the observations are positive. This can be a problem when training the classification model, so the samples in the training set should be balanced between classes before the model is trained.

# Class balance of the objective variable (expss cross-tabulation)
table_info = dataset %>% apply_labels(SFARI = 'SFARI')

cro(table_info$SFARI)
 #Total 
 SFARI 
   FALSE  15211
   TRUE  783
   #Total cases  15994
rm(table_info)

To divide our samples into training and test sets:

set.seed(123)

# Recover each gene's SFARI score (not just the binary label) so the 70/30
# split is stratified by score via createDataPartition
sample_scores = dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID) %>% 
                left_join(original_dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, gene.score), 
                          by = 'ID') %>% 
                mutate(gene.score = ifelse(is.na(gene.score), 'None', gene.score))

train_idx = createDataPartition(sample_scores$gene.score, p = 0.7, list = FALSE)
train_set = dataset[train_idx,]
test_set = dataset[-train_idx,]


# # Validation Set
# aux_dataset = dataset[-train_idx,]
# validation_idx = createDataPartition(sample_scores$gene.score[-train_idx], p = 0.5, list = FALSE)
# validation_set = aux_dataset[validation_idx,]
# 
# # Test Set
# test_set = aux_dataset[-validation_idx,]


rm(sample_scores, train_idx)


Label distribution in training set


To fix this class imbalance, we are going to apply SMOTE to the training set — an over-sampling technique that augments the minority class (SFARI Genes) by creating synthetic examples

# Class balance in the training set (before SMOTE is applied)
cro(train_set$SFARI)
 #Total 
 train_set$SFARI 
   FALSE  10648
   TRUE  549
   #Total cases  11197


Labels distribution in test set


This set is used just to evaluate how well the model performs, so the class imbalance is not a problem here

# Class balance in the test set (left imbalanced on purpose)
cro(test_set$SFARI)
 #Total 
 test_set$SFARI 
   FALSE  4563
   TRUE  234
   #Total cases  4797


Logistic Regression


Train model

# https://shiring.github.io/machine_learning/2017/04/02/unbalanced
# https://topepo.github.io/caret/using-your-own-model-in-train.html#Illustration5


# caret's classProbs requires valid R names for class levels, so recode the
# logical label to 'SFARI'/'not_SFARI'
train_set = train_set %>% mutate(SFARI = ifelse(SFARI==TRUE, 'SFARI', 'not_SFARI') %>% as.factor)

# 10-fold CV repeated 10 times; sampling='smote' makes caret over-sample
# inside each resample rather than on the full training set
smote_over_sampling = trainControl(method = 'repeatedcv', number = 10, repeats = 10, verboseIter = FALSE, 
                                   classProbs = TRUE, savePredictions = 'final', 
                                   summaryFunction = twoClassSummary, sampling = 'smote')

# Using ROC as metric because it doesn't depend on the threshold
fit = caret::train(SFARI ~ ., data = train_set, method = 'glm', family = 'binomial', metric = 'ROC',
                   trControl = smote_over_sampling)

#a = caret::thresholder(fit, threshold = seq(0.05, 0.95, by = 0.1), statistics = 'all')

rm(smote_over_sampling)


Performance


The model has an AUC of 0.6721

But the features are strongly correlated, which inflates the standard error of the coefficients, making them no longer interpretable, so perhaps it would be better to use another model

# VIF: flag features whose Variance Inflation Factor exceeds 10
# (vif() is computed and sorted once; the original evaluated it twice)
vif_vals = car::vif(fit$finalModel) %>% sort
plot_data = data.frame('Feature' = names(vif_vals),
                       'VIF' = unname(vif_vals)) %>%
            mutate(outlier = VIF>10)

plot_data %>% ggplot(aes(reorder(Feature, -VIF), VIF, fill = !outlier)) + geom_bar(stat='identity') + 
              scale_y_log10() + geom_hline(yintercept = 10, color = 'gray', linetype = 'dashed') + 
              xlab('Model Features') + ggtitle('Variance Inflation Number for each Feature') + theme_minimal() +
              theme(legend.position = 'none', axis.text.x = element_text(angle = 90, hjust = 1))

rm(plot_data, vif_vals)

Correlation plot

# Correlation plot
# The last column (the factor label SFARI) is excluded from cor()
corrplot.mixed(cor(train_set[,-ncol(train_set)]), lower = 'number', lower.col = 'gray', number.cex = .6, 
               tl.pos = 'l', tl.col = '#666666')

rm(clusterings)


Possible solutions to Multicollinearity:


  1. Remove all variables with a VIF>10: We would lose all but two of our variables, not ideal

  2. Do Principal Component Regression: We would lose the relation between the prediction and the original features, which could be interesting to study

  3. Don’t do anything: Multicollinearity affects the coefficients and p-values of the regression, but it doesn’t affect the predictions, precision of the predictions or the goodness-of-fit statistics (see reference); but, as with the previous option, we cannot study the coefficients of the regression

  4. Use Ridge Regression: The penalty it gives to high coefficients reduces the variance introduced by the correlation, making the coefficients interpretable again




Ridge Regression


Notes:

### DEFINE FUNCTIONS

create_train_test_sets = function(p, seed){
  
  # Look up every gene's SFARI score so the partition is stratified by score,
  # not just by the binary label; genes without a score are labelled 'None'
  score_lookup = original_dataset %>% mutate(ID = rownames(.)) %>%
                 dplyr::select(ID, gene.score)
  sample_scores = dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID) %>%
                  left_join(score_lookup, by = 'ID') %>%
                  mutate(gene.score = ifelse(is.na(gene.score), 'None', gene.score))

  # Stratified split: a proportion p of each score group goes to training
  set.seed(seed)
  train_idx = createDataPartition(sample_scores$gene.score, p = p, list = FALSE)
  
  return(list('train_set' = dataset[train_idx,],
              'test_set' = dataset[-train_idx,]))
}



# Train one ridge-regression model (glmnet, alpha = 0) on a stratified split
# and evaluate it on the held-out set.
# p: proportion of samples used for training; seed: RNG seed for the split/CV.
# Returns a list with accuracy, precision, recall, F1, AUC, the per-gene
# predictions and the fitted coefficient vector.
run_model = function(p, seed){
  
  # Create train and test sets
  train_test_sets = create_train_test_sets(p, seed)
  train_set = train_test_sets[['train_set']]
  test_set = train_test_sets[['test_set']]
  
  # Train Model: SMOTE inside repeated CV, tuning only lambda (alpha fixed at
  # 0 makes glmnet a pure ridge regression)
  train_set = train_set %>% mutate(SFARI = ifelse(SFARI==TRUE, 'SFARI', 'not_SFARI') %>% as.factor)
  lambda_seq = 10^seq(1, -4, by = -.1)
  set.seed(seed)
  k_fold = 10
  cv_repeats = 5
  smote_over_sampling = trainControl(method = 'repeatedcv', number = k_fold, repeats = cv_repeats,
                                     verboseIter = FALSE, classProbs = TRUE, savePredictions = 'final', 
                                     summaryFunction = twoClassSummary, sampling = 'smote')
  fit = train(SFARI ~., data = train_set, method = 'glmnet', trControl = smote_over_sampling, metric = 'ROC',
              tuneGrid = expand.grid(alpha = 0, lambda = lambda_seq))
  
  # Predict labels in test set: probability of the SFARI class, 0.5 threshold
  predictions = fit %>% predict(test_set, type = 'prob')
  preds = data.frame('ID' = rownames(test_set), 'prob' = predictions$SFARI) %>% mutate(pred = prob>0.5)

  # Measure performance of the model (logicals coerce to 0/1; positive = '1')
  acc = mean(test_set$SFARI==preds$pred)
  prec = Precision(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
  rec = Recall(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
  F1 = F1_Score(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
  pred_ROCR = prediction(preds$prob, test_set$SFARI)
  AUC = performance(pred_ROCR, measure='auc')@y.values[[1]]
  
  # Extract coefficients from features at the CV-selected lambda
  coefs = coef(fit$finalModel, fit$bestTune$lambda) %>% as.vector
  
  return(list('acc' = acc, 'prec' = prec, 'rec' = rec, 'F1' = F1, 'AUC' = AUC, 'preds' = preds, 'coefs' =coefs))
}


### RUN MODEL

# Parameters
p = 0.75

n_iter = 25
seeds = 123:(123+n_iter-1)

# Store outputs: per-iteration metric vectors, plus running sums of each
# gene's probability/prediction (with a counter n of how many test sets it
# appeared in) and of the coefficient vector
acc = c()
prec = c()
rec = c()
F1 = c()
AUC = c()
predictions = data.frame('ID' = rownames(dataset), 'SFARI' = dataset$SFARI, 'prob' = 0, 'pred' = 0, 'n' = 0)
coefs = data.frame('var' = c('Intercept', colnames(dataset[,-ncol(dataset)])), 'coef' = 0)

for(seed in seeds){
  
  # Run model
  model_output = run_model(p, seed)
  
  # Update outputs
  acc = c(acc, model_output[['acc']])
  prec = c(prec, model_output[['prec']])
  rec = c(rec, model_output[['rec']])
  F1 = c(F1, model_output[['F1']])
  AUC = c(AUC, model_output[['AUC']])
  preds = model_output[['preds']]
  coefs$coef = coefs$coef + model_output[['coefs']]
  update_preds = preds %>% dplyr::select(-ID) %>% mutate(n=1)
  predictions[predictions$ID %in% preds$ID, c('prob','pred','n')] = predictions[predictions$ID %in% 
                                                                      preds$ID, c('prob','pred','n')] +
                                                                    update_preds
}

# Average the sums: coefficients over all iterations; each gene's prob over
# the iterations where it was in the test set. pred_count keeps the raw count
# of positive predictions; the final pred thresholds the mean prob at 0.5
coefs = coefs %>% mutate(coef = coef/n_iter)
predictions = predictions %>% mutate(prob = prob/n, pred_count = pred, pred = prob>0.5)


rm(p, seeds, update_preds, create_train_test_sets, run_model)


To summarise in a single value the predictions of the models:

# Genes that appeared in at least one test set, with their averaged
# predictions plus GS and MTcor for later plots (filter preserves row order,
# so the rowname vector below stays aligned)
test_set = predictions %>% filter(n>0) %>% 
           left_join(dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, GS, MTcor), by = 'ID')
rownames(test_set) = predictions$ID[predictions$n>0]


Performance metrics


Confusion matrix

# Confusion matrix of the averaged predictions against the actual labels
conf_mat = test_set %>% apply_labels(SFARI = 'Actual Labels', 
                                     prob = 'Assigned Probability', 
                                     pred = 'Label Prediction')

cro(conf_mat$SFARI, list(conf_mat$pred, total()))
 Label Prediction     #Total 
 FALSE   TRUE   
 Actual Labels 
   FALSE  12124 3073   15197
   TRUE  416 366   782
   #Total cases  12540 3439   15979
rm(conf_mat)


Accuracy: Mean = 0.78 SD = 0.0102


Precision: Mean = 0.1066 SD = 0.0084


Recall: Mean = 0.4745 SD = 0.0363


F1 score: Mean = 0.174 SD = 0.0133


ROC Curve: Mean = 0.6915 SD = 0.0195

# ROC curve built from the averaged probabilities of all test-set appearances
pred_ROCR = prediction(test_set$prob, test_set$SFARI)

roc_ROCR = performance(pred_ROCR, measure='tpr', x.measure='fpr')
auc = performance(pred_ROCR, measure='auc')@y.values[[1]]

plot(roc_ROCR, main=paste0('ROC curve (AUC=',round(auc,2),')'), col='#009999')
abline(a=0, b=1, col='#666666')  # chance diagonal


Lift Curve

# Lift curve: lift vs rate of positive predictions (rpp)
lift_ROCR = performance(pred_ROCR, measure='lift', x.measure='rpp')
plot(lift_ROCR, main='Lift curve', col='#86b300')

rm(pred_ROCR, roc_ROCR, AUC, lift_ROCR)




Coefficients


MTcor and absGS have a very small coefficient and Gene Significance has a negative coefficient

# Per-gene module assignment and MTcor; module names are hex colours, so the
# '#' is stripped to match the 'MM.xxxxxx' feature names (after gsub below)
gene_corr_info = dataset %>% mutate('ID' = rownames(dataset)) %>% dplyr::select(ID, MTcor, SFARI) %>% 
                 left_join(assigned_module, by ='ID') %>% mutate(Module = gsub('#','',Module))

# For each feature: its coefficient, its module's MTcor (NA for non-MM
# features) and the fraction of SFARI genes in the module
coef_info = coefs %>% mutate('feature' = gsub('MM.','',var)) %>% 
            left_join(gene_corr_info, by = c('feature' = 'Module')) %>% 
            dplyr::select(feature, coef, MTcor, SFARI) %>% group_by(feature, coef, MTcor) %>% 
            summarise('SFARI_perc' = mean(SFARI)) %>% arrange(desc(coef))

coef_info %>% dplyr::select(feature, coef) %>% filter(feature %in% c('Intercept','GS','absGS','MTcor')) %>%
              dplyr::rename('Feature' = feature, 'Coefficient' = coef) %>% 
              kable(align = 'cc', caption = 'Regression Coefficients  (filtering MM features)') %>% 
              kable_styling(full_width = F)
Regression Coefficients (filtering MM features)
Feature Coefficient
MTcor -0.0442725
absGS -0.0645774
GS -0.0934556
Intercept -0.5486990


  • There is a positive relation between the coefficient assigned to the membership of each module and the enrichment (using ORA) in SFARI genes that are assigned to that module
# ORA results — provides enrichment_SFARI (per-module enrichment tables)
load('./../Data/ORA.RData')

# Enrichment score per module: 1 - p-value of the 'SFARI' term (0 when the
# term is absent from the module's table)
enrichment_SFARI_info = data.frame('Module'=as.character(), 'SFARI_enrichment'=as.numeric())
for(m in names(enrichment_SFARI)){
  m_info = enrichment_SFARI[[m]]
  enrichment = 1-ifelse('SFARI' %in% m_info$ID, m_info$pvalue[m_info$ID=='SFARI'],1)
  enrichment_SFARI_info = enrichment_SFARI_info %>% 
                          add_row(Module = gsub('#','',m), SFARI_enrichment = enrichment)
}

plot_data = coef_info %>% dplyr::rename('Module' = feature) %>% 
            left_join(enrichment_SFARI_info, by = 'Module') %>% filter(!is.na(MTcor))

# Each point is coloured with its own module colour (Module = hex sans '#')
ggplotly(plot_data %>% ggplot(aes(coef, SFARI_enrichment)) + 
         geom_smooth(method = 'lm', color = 'gray', alpha = 0.1) + 
         geom_point(aes(id = Module), color = paste0('#',plot_data$Module), alpha=0.7) + 
         theme_minimal() + xlab('Coefficient') + 
         ylab('SFARI Genes Enrichment'))
# NOTE(review): assumes the other enrichment_* objects also come from
# ORA.RData — confirm
rm(enrichment_old_SFARI, enrichment_DGN, enrichment_DO, enrichment_GO, enrichment_KEGG, enrichment_Reactome,
   m, m_info, enrichment)


  • There doesn’t seem to be a relation between the coefficient and the correlation of the module and the diagnosis.

This is not a surprise since we knew that there was a negative relation between SFARI genes and Module-Diagnosis correlation from Preprocessing/Gandal/AllRegions/RMarkdowns/20_04_03_WGCNA_modules_EA.html. The fact that there is no relation between coefficient and Module-Diagnosis correlation could even be a good sign that the model is picking some biological signal as well as the SFARI patterns (since the relation with the biological signals is positive)

# Coefficient vs Module-Diagnosis correlation. The colour vector is subset
# with the same !is.na(MTcor) mask used to filter the data, keeping the
# point order and colours aligned
ggplotly(coef_info %>% dplyr::rename('Module' = feature) %>% filter(!is.na(MTcor)) %>%
              ggplot(aes(coef, MTcor)) +  geom_smooth(method = 'lm', color = 'gray', alpha = 0.1) + 
              geom_point(aes(id = Module), color = paste0('#',coef_info$feature[!is.na(coef_info$MTcor)]), 
                         alpha = 0.7) + 
              theme_minimal() + xlab('Coefficient') + 
              ylab('Module-Diagnosis correlation'))




Analyse Results


Score distribution by SFARI Label


SFARI genes have a higher score distribution than the rest, but the overlap is large

plot_data = test_set %>% dplyr::select(prob, SFARI)

# Density of model scores per class; dashed vertical lines mark class means
ggplotly(plot_data %>% ggplot(aes(prob, fill=SFARI, color=SFARI)) + geom_density(alpha=0.3) + xlab('Score') +
         geom_vline(xintercept = mean(plot_data$prob[plot_data$SFARI]), color = '#00C0C2', linetype = 'dashed') +
         geom_vline(xintercept = mean(plot_data$prob[!plot_data$SFARI]), color = '#FF7371', linetype = 'dashed') +
         theme_minimal() + ggtitle('Model score distribution by SFARI Label'))


Score distribution by SFARI Score


Even though we didn’t use the actual SFARI Scores to train the model, but instead we grouped them all together, there seems to be a statistically significant positive relation between the SFARI scores and the probability assigned by the model

# Recover the original SFARI score per gene; unscored genes are relabelled
# 'Neuronal' (if they match a neuronal GO term) or 'Others'
plot_data = test_set %>% mutate(ID=rownames(test_set)) %>% dplyr::select(ID, prob) %>%
            left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID') %>%
            mutate(gene.score = ifelse(gene.score=='None', ifelse(ID %in% GO_neuronal$ID, 'Neuronal', 'Others'), 
                                       gene.score)) %>%
            dplyr::select(ID, prob, gene.score) %>% apply_labels(gene.score='SFARI Gene score')

cro(plot_data$gene.score)
 #Total 
 SFARI Gene score 
   1  144
   2  205
   3  433
   Neuronal  928
   Others  14269
   #Total cases  15979
# NOTE(review): mean_vals is computed but never used before being removed
mean_vals = plot_data %>% group_by(gene.score) %>% summarise(mean_prob = mean(prob))

# Pairwise group comparisons for Welch t-tests, with label heights staggered
# so the significance brackets don't overlap
comparisons = list(c('1','2'), c('2','3'), c('3','Neuronal'), c('Neuronal','Others'),
                   c('1','3'), c('3','Others'), c('2','Neuronal'),
                   c('1','Neuronal'), c('2','Others'), c('1','Others'))
increase = 0.08
base = 0.9
pos_y_comparisons = c(rep(base, 4), rep(base + increase, 2), base + 2:5*increase)

plot_data %>% ggplot(aes(gene.score, prob, fill=gene.score)) + 
              geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) +
              stat_compare_means(comparisons = comparisons, label = 'p.signif', method = 't.test', 
                                 method.args = list(var.equal = FALSE), label.y = pos_y_comparisons, 
                                 tip.length = .02) +
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + 
              ggtitle('Distribution of probabilities by SFARI score') +
              xlab('SFARI score') + ylab('Probability') + theme_minimal() + theme(legend.position = 'none')

# NOTE(review): comparisons is left in the workspace — confirm intentional
rm(mean_vals, increase, base, pos_y_comparisons)


Genes with the highest Probabilities


  • Considering the class imbalance in the test set (roughly 1 SFARI gene per 19 non-SFARI genes), SFARI genes are strongly over-represented here (roughly 1 in 3)

  • High concentration of genes with a SFARI Score of 1

# Top 50 genes by model probability, annotated with gene symbol, module,
# module-diagnosis correlation and SFARI score
test_set %>% dplyr::select(prob, SFARI) %>% mutate(ID = rownames(test_set)) %>% 
             arrange(desc(prob)) %>% top_n(50, wt=prob) %>%
             left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID')  %>%
             mutate(gene.score = ifelse(gene.score=='None', ifelse(ID %in% GO_neuronal$ID, 'Neuronal', 'Others'), 
                                        gene.score)) %>%
             left_join(gene_names, by = c('ID'='ensembl_gene_id')) %>%
             dplyr::rename('GeneSymbol' = external_gene_id, 'Probability' = prob, 'ModuleDiagnosis_corr' =MTcor,
                           'GeneSignificance' = GS) %>%
             mutate(ModuleDiagnosis_corr = round(ModuleDiagnosis_corr, 4), Probability = round(Probability, 4), 
                    GeneSignificance = round(GeneSignificance, 4)) %>%
             dplyr::select(GeneSymbol, GeneSignificance, ModuleDiagnosis_corr, Module, Probability,
                           gene.score) %>%
             kable(caption = 'Genes with highest model probabilities from the test set') %>% 
             kable_styling(full_width = F)
Genes with highest model probabilities from the test set
GeneSymbol GeneSignificance ModuleDiagnosis_corr Module Probability gene.score
ARID1B 0.2675 0.3428 #00C08C 0.8685 1
MYCBP2 -0.3960 -0.3847 #00B7E7 0.8678 Neuronal
TANC2 -0.2362 -0.5573 #00C0BF 0.8649 1
EP300 -0.0235 0.3428 #00C08C 0.8648 1
PRDM2 -0.2516 0.0721 #96A900 0.8572 Others
RPRD2 -0.0931 0.3428 #00C08C 0.8562 Others
NFAT5 0.1025 0.0721 #96A900 0.8520 Others
ANK2 -0.4350 -0.9003 #00BADE 0.8489 1
KMT2A 0.1343 0.6175 #44A0FF 0.8449 1
KMT2D -0.3274 -0.5573 #00C0BF 0.8414 Others
RFX7 0.1381 0.0721 #96A900 0.8377 Others
TLE3 0.4865 0.5341 #EF67EB 0.8357 Others
FMNL1 -0.2229 -0.3847 #00B7E7 0.8340 Others
HIVEP2 0.0188 -0.9003 #00BADE 0.8321 1
MBD5 0.1916 0.0721 #96A900 0.8285 1
HUNK -0.3276 -0.3731 #4BB400 0.8265 Others
SRRM4 -0.4423 -0.5573 #00C0BF 0.8256 Others
SRCAP -0.3631 -0.5573 #00C0BF 0.8249 1
C20orf112 0.0294 0.3428 #00C08C 0.8239 Others
ANK3 -0.4808 -0.8123 #00BECA 0.8234 1
RAPH1 -0.0554 -0.3847 #00B7E7 0.8220 Others
TET3 0.4848 0.5341 #EF67EB 0.8205 Others
EP400 -0.1699 -0.5573 #00C0BF 0.8201 2
NRXN3 -0.6712 -0.9003 #00BADE 0.8184 1
SAP130 -0.2497 -0.5573 #00C0BF 0.8148 Others
TLN2 -0.4786 -0.7233 #FF63B6 0.8142 Others
RNF111 -0.2393 0.0721 #96A900 0.8138 Others
ARID1A -0.1393 -0.5573 #00C0BF 0.8116 Others
ASAP1 -0.0917 -0.3847 #00B7E7 0.8115 Others
CREBBP 0.1424 0.3428 #00C08C 0.8112 1
GRIN2B -0.2787 -0.7233 #FF63B6 0.8110 1
DAAM1 0.1540 -0.0127 #00BF7D 0.8102 Others
KCNJ6 -0.1389 -0.9003 #00BADE 0.8100 Others
CACNG3 -0.4698 -0.6770 #00A6FF 0.8100 Others
CELF2 -0.0655 -0.9003 #00BADE 0.8096 Others
DROSHA -0.4103 -0.5573 #00C0BF 0.8088 Others
BRPF3 -0.4401 -0.5573 #00C0BF 0.8081 Others
PLXNC1 -0.0093 0.5341 #EF67EB 0.8080 Others
SIK3 -0.1236 -0.3847 #00B7E7 0.8060 Others
GLTSCR1L -0.1611 0.0721 #96A900 0.8051 Others
SPRED2 0.2565 0.5341 #EF67EB 0.8048 Others
AMPD3 -0.2812 -0.9003 #00BADE 0.8032 Others
POM121C -0.2811 -0.5573 #00C0BF 0.8031 Others
ST8SIA3 -0.1994 -0.7233 #FF63B6 0.8028 Others
BAZ2A 0.0518 0.0721 #96A900 0.8010 Others
RIMBP2 0.2592 0.5341 #EF67EB 0.8007 Others
CMIP -0.5025 -0.5573 #00C0BF 0.8006 3
UBR4 -0.2871 -0.5573 #00C0BF 0.7975 Others
ETV6 -0.1439 -0.6508 #E76BF3 0.7975 Others
RASGRF1 -0.7424 -0.9003 #00BADE 0.7973 Neuronal





Negative samples distribution


The objective of this model is to identify candidate SFARI genes. For this, we are going to focus on the negative samples (the non-SFARI genes)

# Negative samples only: the non-SFARI genes are the candidate pool
negative_set = test_set %>% filter(!SFARI)

negative_set_table = negative_set %>% apply_labels(prob = 'Assigned Probability', 
                                                   pred = 'Label Prediction')

cro(negative_set_table$pred)
 #Total 
 Label Prediction 
   FALSE  12124
   TRUE  3073
   #Total cases  15197

3073 genes are predicted as ASD-related


# Probability distribution of the negatives; the dotted line is the 0.5
# decision threshold
negative_set %>% ggplot(aes(prob)) + geom_density(color='#F8766D', fill='#F8766D', alpha=0.5) +
                 geom_vline(xintercept=0.5, color='#333333', linetype='dotted') + xlab('Probability') +
                 ggtitle('Probability distribution of the Negative samples in the Test Set') + 
                 theme_minimal()


# Top 50 non-SFARI genes by model probability: the candidate SFARI genes
negative_set %>% dplyr::select(prob, SFARI) %>% mutate(ID = rownames(negative_set)) %>% 
                 arrange(desc(prob)) %>% top_n(50, wt = prob) %>%
                 left_join(original_dataset %>% mutate(ID = rownames(original_dataset)), by = 'ID')  %>%
                 mutate(gene.score = ifelse(gene.score=='None', 
                                            ifelse(ID %in% GO_neuronal$ID,'Neuronal','Others'), 
                                            gene.score)) %>%
                 left_join(gene_names, by = c('ID'='ensembl_gene_id')) %>%
                 dplyr::rename('GeneSymbol' = external_gene_id, 'Probability' = prob, 
                               'ModuleDiagnosis_corr' =MTcor, 'GeneSignificance' = GS) %>%
                 mutate(ModuleDiagnosis_corr = round(ModuleDiagnosis_corr, 4), 
                        Probability = round(Probability, 4), 
                        GeneSignificance = round(GeneSignificance, 4)) %>%
                 dplyr::select(GeneSymbol, GeneSignificance, ModuleDiagnosis_corr, Module, Probability,
                               gene.score) %>%
                 kable(caption = 'Genes with highest model probabilities from the Negative set') %>% 
                 kable_styling(full_width = F)
Genes with highest model probabilities from the Negative set
GeneSymbol GeneSignificance ModuleDiagnosis_corr Module Probability gene.score
MYCBP2 -0.3960 -0.3847 #00B7E7 0.8678 Neuronal
PRDM2 -0.2516 0.0721 #96A900 0.8572 Others
RPRD2 -0.0931 0.3428 #00C08C 0.8562 Others
NFAT5 0.1025 0.0721 #96A900 0.8520 Others
KMT2D -0.3274 -0.5573 #00C0BF 0.8414 Others
RFX7 0.1381 0.0721 #96A900 0.8377 Others
TLE3 0.4865 0.5341 #EF67EB 0.8357 Others
FMNL1 -0.2229 -0.3847 #00B7E7 0.8340 Others
HUNK -0.3276 -0.3731 #4BB400 0.8265 Others
SRRM4 -0.4423 -0.5573 #00C0BF 0.8256 Others
C20orf112 0.0294 0.3428 #00C08C 0.8239 Others
RAPH1 -0.0554 -0.3847 #00B7E7 0.8220 Others
TET3 0.4848 0.5341 #EF67EB 0.8205 Others
SAP130 -0.2497 -0.5573 #00C0BF 0.8148 Others
TLN2 -0.4786 -0.7233 #FF63B6 0.8142 Others
RNF111 -0.2393 0.0721 #96A900 0.8138 Others
ARID1A -0.1393 -0.5573 #00C0BF 0.8116 Others
ASAP1 -0.0917 -0.3847 #00B7E7 0.8115 Others
DAAM1 0.1540 -0.0127 #00BF7D 0.8102 Others
KCNJ6 -0.1389 -0.9003 #00BADE 0.8100 Others
CACNG3 -0.4698 -0.6770 #00A6FF 0.8100 Others
CELF2 -0.0655 -0.9003 #00BADE 0.8096 Others
DROSHA -0.4103 -0.5573 #00C0BF 0.8088 Others
BRPF3 -0.4401 -0.5573 #00C0BF 0.8081 Others
PLXNC1 -0.0093 0.5341 #EF67EB 0.8080 Others
SIK3 -0.1236 -0.3847 #00B7E7 0.8060 Others
GLTSCR1L -0.1611 0.0721 #96A900 0.8051 Others
SPRED2 0.2565 0.5341 #EF67EB 0.8048 Others
AMPD3 -0.2812 -0.9003 #00BADE 0.8032 Others
POM121C -0.2811 -0.5573 #00C0BF 0.8031 Others
ST8SIA3 -0.1994 -0.7233 #FF63B6 0.8028 Others
BAZ2A 0.0518 0.0721 #96A900 0.8010 Others
RIMBP2 0.2592 0.5341 #EF67EB 0.8007 Others
UBR4 -0.2871 -0.5573 #00C0BF 0.7975 Others
ETV6 -0.1439 -0.6508 #E76BF3 0.7975 Others
RASGRF1 -0.7424 -0.9003 #00BADE 0.7973 Neuronal
GATAD2B -0.4236 -0.5573 #00C0BF 0.7972 Others
YLPM1 0.2269 0.6175 #44A0FF 0.7949 Others
FOXJ2 0.2507 0.3428 #00C08C 0.7944 Others
NAV1 -0.1759 -0.5573 #00C0BF 0.7928 Neuronal
SKI 0.2138 0.1909 #FF699B 0.7923 Others
TP53INP2 -0.2133 -0.4661 #1FB700 0.7923 Others
ZNF592 0.2727 0.3428 #00C08C 0.7916 Others
SCAF4 0.0161 -0.5573 #00C0BF 0.7912 Others
R3HDM2 -0.4093 -0.3847 #00B7E7 0.7891 Others
SPTAN1 -0.2888 -0.5573 #00C0BF 0.7885 Others
KIAA1109 -0.1158 0.0721 #96A900 0.7878 Others
MEF2D -0.4185 -0.5573 #00C0BF 0.7876 Others
HUWE1 -0.5029 -0.5573 #00C0BF 0.7859 Others
DLGAP4 -0.3338 -0.5573 #00C0BF 0.7846 Others




Probability and Gene Significance


  • There’s a lot of noise, but the probability the model assigns to each gene seems to have a negative relation with the Gene Significance (under-expressed genes having on average the higher probabilities and over-expressed genes the lowest)

  • The pattern is stronger in under-expressed genes

# Probability vs Gene Significance, points coloured by the module-diagnosis
# correlation; the loess curve shows the overall (negative) trend
gs_plot = ggplot(negative_set, aes(x = prob, y = GS, color = MTcor)) +
  geom_point() +
  geom_smooth(method = 'loess', color = '#666666') +
  geom_hline(yintercept = 0, color='gray', linetype='dashed') +
  xlab('Probability') +
  ylab('Gene Significance') +
  scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) +
  ggtitle('Relation between Probability and Gene Significance') +
  theme_minimal()
gs_plot




Probability and Module-Diagnosis correlation


  • There is no strong relation between the Module-Diagnosis correlation of a gene’s assigned module and the probability assigned by the model

  • The model seems to assign slightly higher probabilities to genes belonging to modules with negative module-Diagnosis correlations than to genes belonging to modules with positive ones

# Probability vs module-diagnosis correlation; the dashed line marks the mean
# probability over the whole negative set
mtcor_plot = ggplot(negative_set, aes(x = MTcor, y = prob, color = GS)) +
  geom_point() +
  geom_smooth(method='loess', color='#666666') +
  geom_hline(yintercept=mean(negative_set$prob), color='gray', linetype='dashed') +
  scale_color_gradientn(colours=c('#F8766D','#F8766D','white','#00BFC4','#00BFC4')) +
  xlab('Modules ordered by their correlation to ASD') +
  ylab('Model probability') +
  theme_minimal()
mtcor_plot




Summarised version, plotting by module instead of by gene


The difference in the trend lines between this plot and the one above is that the one above takes all the points into consideration while this considers each module as an observation by itself, so the top one is strongly affected by big modules and the bottom one treats all modules the same

The model seems to give lower probabilities to genes belonging to modules with a high (absolute) correlation to Diagnosis (this is unexpected)

# Summarise probabilities per module (MTcor uniquely identifies a module here),
# then recover the module colour via a join on MTcor
module_stats = negative_set %>%
               group_by(MTcor) %>%
               summarise(mean = mean(prob), sd = sd(prob), n = n()) %>%
               mutate(MTcor_sign = ifelse(MTcor>0, 'Positive', 'Negative')) %>%
               left_join(original_dataset, by='MTcor') %>%
               dplyr::select(Module, MTcor, MTcor_sign, mean, sd, n) %>%
               distinct()
colnames(module_stats)[1] = 'ID'

# One point per module, sized by module size, with loess and linear trends
ggplotly(module_stats %>% ggplot(aes(MTcor, mean, size=n, color=MTcor_sign)) +
         geom_point(aes(id=ID), alpha=0.7) +
         geom_smooth(method='loess', color='gray', se=FALSE) +
         geom_smooth(method='lm', se=FALSE) +
         xlab('Module-Diagnosis correlation') +
         ylab('Mean Probability by Module') +
         theme_minimal())




Probability and level of expression


There is a positive relation between level of expression and probability, the model seems to be capturing indirectly the level of expression of the genes to make the prediction, so it’s introducing the same bias

We had already found some mean expression bias in the SFARI scores, and thought we had eliminated this information when constructing the WGCNA network (because correlation is invariant under linear transformations), but a signal related to it seems to have survived into the network and the model is picking it up

# Gandal dataset: reload the preprocessed expression data to compute per-gene
# expression statistics (loads datExpr, DE_info, among others)
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame

# Per-gene mean and standard deviation of expression
expr_stats = data.frame(ID = rownames(datExpr),
                        meanExpr = rowMeans(datExpr),
                        sdExpr = apply(datExpr, 1, sd))

# Attach expression statistics and module assignment to the negative set
expr_plot_data = negative_set %>%
                 left_join(expr_stats, by='ID') %>%
                 left_join(original_dataset %>%
                             mutate(ID=rownames(original_dataset)) %>%
                             dplyr::select(ID, Module),
                           by='ID')
colnames(expr_plot_data)[ncol(expr_plot_data)] = 'Module'

# Gene-level view: mean expression vs assigned probability
expr_plot_data %>% ggplot(aes(meanExpr, prob)) +
                   geom_point(alpha=0.2, color='#0099cc') +
                   geom_smooth(method='loess', color='gray', alpha=0.3) +
                   xlab('Mean Expression') +
                   ylab('Probability') +
                   ggtitle('Mean expression vs model probability by gene') +
                   theme_minimal()

rm(expr_stats)

# Module-level view: average expression vs average probability per module
module_summary = expr_plot_data %>%
                 group_by(Module) %>%
                 summarise(meanExpr = mean(meanExpr), meanProb = mean(prob), n = n())

# Module column holds hex colours, so it can be passed directly to geom_point
ggplotly(module_summary %>% ggplot(aes(meanExpr, meanProb, size=n)) +
         geom_point(color=module_summary$Module, alpha = 0.6) +
         geom_smooth(method='loess', se=TRUE, color='gray', alpha=0.1, size=0.7) +
         xlab('Mean Level of Expression') +
         ylab('Average Model Probability') +
         theme_minimal() + theme(legend.position='none') +
         ggtitle('Mean expression vs model probability by Module'))
rm(module_summary)




Probability and LFC


There is a relation between probability and LFC, so it IS capturing a bit of true information (because LFC and mean expression were negatively correlated and it still has a positive relation in the model)

  • The relation is stronger in genes under-expressed in ASD
# Join DE statistics (log2FoldChange, padj) onto the negative-set predictions
plot_data = negative_set %>% left_join(DE_info %>% mutate(ID=rownames(DE_info)), by='ID')

# LFC vs model probability: a relation here suggests the model captures some
# true differential-expression signal, not only mean expression level.
# (The original chain called xlab('LFC') twice; the duplicate is removed —
# the rendered plot is unchanged.)
plot_data %>% ggplot(aes(log2FoldChange, prob)) + geom_point(alpha=0.1, color='#0099cc') + 
              geom_smooth(method='loess', color='gray', alpha=0.3) +
              xlab('LFC') + ylab('Probability') +
              theme_minimal() + ggtitle('LFC vs model probability by gene')

  • The relation is stronger in Differentially Expressed genes
# Left panel: genes under-expressed in ASD (LFC < 0), coloured by DE status
# (padj < 0.05; genes with missing padj become NA and form their own colour
# group — TODO confirm this is intended)
p1 = plot_data %>% filter(log2FoldChange<0) %>% mutate(DE = padj<0.05) %>%
     ggplot(aes(log2FoldChange, prob, color=DE)) + geom_point(alpha=0.1) + 
     geom_smooth(method='loess', alpha=0.1) + xlab('') + ylab('Probability') + 
     ylim(c(min(plot_data$prob), max(plot_data$prob))) + 
     theme_minimal() + theme(legend.position = 'none', plot.margin=unit(c(1,-0.3,1,1), 'cm'))

# Right panel: genes over-expressed in ASD (LFC >= 0); the y axis is mirrored
# on the right with an empty label so the two panels read as one shared axis.
# (The original set ylab('Probability') and then overrode it with ylab('');
# the redundant first call is removed — the rendered label is unchanged.)
p2 = plot_data %>% filter(log2FoldChange>=0) %>% mutate(DE = padj<0.05) %>% 
     ggplot(aes(log2FoldChange, prob, color=DE)) + geom_point(alpha=0.1) + 
     geom_smooth(method='loess', alpha=0.1) + xlab('') + ylab('') +
     scale_y_continuous(position = 'right', limits = c(min(plot_data$prob), max(plot_data$prob))) +
     theme_minimal() + theme(plot.margin = unit(c(1,1,1,-0.3), 'cm'), axis.ticks.y = element_blank())

# Arrange the two panels side by side with a shared title and x-axis label
grid.arrange(p1, p2, nrow=1, top = 'LFC vs model probability by gene', bottom = 'LFC')

rm(p1, p2)



Conclusion


The model is capturing the mean level of expression of the genes (indirectly, through module membership), which is a strong bias found in the SFARI scores, but it seems to be capturing a bit of true biological signal as well (based on the GS and the log fold change plots)


Saving results

# Attach gene symbols to the test-set predictions for downstream reporting
predictions = left_join(test_set, gene_names, by = c('ID' = 'ensembl_gene_id'))

# Persist the predictions, the modelling dataset and the fitted model
save(predictions, dataset, fit, file = './../Data/Ridge_model.RData')




Session info

# Record the R version and package versions used to produce this report
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] DMwR_0.4.1         doParallel_1.0.15  iterators_1.0.12   foreach_1.5.0     
##  [5] kableExtra_1.1.0   expss_0.10.2       corrplot_0.84      MLmetrics_1.1.1   
##  [9] car_3.0-7          carData_3.0-3      ROCR_1.0-7         gplots_3.0.3      
## [13] caret_6.0-86       lattice_0.20-41    polycor_0.7-10     biomaRt_2.40.5    
## [17] ggpubr_0.2.5       magrittr_1.5       RColorBrewer_1.1-2 gridExtra_2.3     
## [21] viridis_0.5.1      viridisLite_0.3.0  plotly_4.9.2       knitr_1.28        
## [25] forcats_0.5.0      stringr_1.4.0      dplyr_1.0.0        purrr_0.3.4       
## [29] readr_1.3.1        tidyr_1.1.0        tibble_3.0.1       ggplot2_3.3.2     
## [33] tidyverse_1.3.0   
## 
## loaded via a namespace (and not attached):
##   [1] tidyselect_1.1.0            RSQLite_2.2.0              
##   [3] AnnotationDbi_1.46.1        htmlwidgets_1.5.1          
##   [5] BiocParallel_1.18.1         pROC_1.16.2                
##   [7] munsell_0.5.0               codetools_0.2-16           
##   [9] miniUI_0.1.1.1              withr_2.2.0                
##  [11] colorspace_1.4-1            Biobase_2.44.0             
##  [13] highr_0.8                   rstudioapi_0.11            
##  [15] stats4_3.6.3                ggsignif_0.6.0             
##  [17] TTR_0.23-6                  labeling_0.3               
##  [19] GenomeInfoDbData_1.2.1      bit64_0.9-7                
##  [21] farver_2.0.3                vctrs_0.3.1                
##  [23] generics_0.0.2              ipred_0.9-9                
##  [25] xfun_0.12                   R6_2.4.1                   
##  [27] GenomeInfoDb_1.20.0         locfit_1.5-9.4             
##  [29] bitops_1.0-6                DelayedArray_0.10.0        
##  [31] assertthat_0.2.1            promises_1.1.0             
##  [33] scales_1.1.1                nnet_7.3-14                
##  [35] ggExtra_0.9                 gtable_0.3.0               
##  [37] timeDate_3043.102           rlang_0.4.6                
##  [39] genefilter_1.66.0           splines_3.6.3              
##  [41] lazyeval_0.2.2              ModelMetrics_1.2.2.2       
##  [43] acepack_1.4.1               broom_0.5.5                
##  [45] checkmate_2.0.0             yaml_2.2.1                 
##  [47] reshape2_1.4.4              abind_1.4-5                
##  [49] modelr_0.1.6                crosstalk_1.1.0.1          
##  [51] backports_1.1.8             quantmod_0.4.17            
##  [53] httpuv_1.5.2                Hmisc_4.4-0                
##  [55] tools_3.6.3                 lava_1.6.7                 
##  [57] ellipsis_0.3.1              BiocGenerics_0.30.0        
##  [59] Rcpp_1.0.4.6                plyr_1.8.6                 
##  [61] base64enc_0.1-3             progress_1.2.2             
##  [63] zlibbioc_1.30.0             RCurl_1.98-1.2             
##  [65] prettyunits_1.1.1           rpart_4.1-15               
##  [67] zoo_1.8-8                   S4Vectors_0.22.1           
##  [69] SummarizedExperiment_1.14.1 haven_2.2.0                
##  [71] cluster_2.1.0               fs_1.4.0                   
##  [73] data.table_1.12.8           openxlsx_4.1.4             
##  [75] reprex_0.3.0                matrixStats_0.56.0         
##  [77] hms_0.5.3                   mime_0.9                   
##  [79] evaluate_0.14               xtable_1.8-4               
##  [81] XML_3.99-0.3                rio_0.5.16                 
##  [83] jpeg_0.1-8.1                readxl_1.3.1               
##  [85] shape_1.4.4                 IRanges_2.18.3             
##  [87] compiler_3.6.3              KernSmooth_2.23-17         
##  [89] crayon_1.3.4                htmltools_0.4.0            
##  [91] mgcv_1.8-31                 later_1.0.0                
##  [93] Formula_1.2-3               geneplotter_1.62.0         
##  [95] lubridate_1.7.4             DBI_1.1.0                  
##  [97] dbplyr_1.4.2                MASS_7.3-51.6              
##  [99] Matrix_1.2-18               cli_2.0.2                  
## [101] gdata_2.18.0                gower_0.2.1                
## [103] GenomicRanges_1.36.1        pkgconfig_2.0.3            
## [105] foreign_0.8-76              recipes_0.1.10             
## [107] xml2_1.2.5                  annotate_1.62.0            
## [109] webshot_0.5.2               XVector_0.24.0             
## [111] prodlim_2019.11.13          rvest_0.3.5                
## [113] digest_0.6.25               rmarkdown_2.1              
## [115] cellranger_1.1.0            htmlTable_1.13.3           
## [117] curl_4.3                    shiny_1.4.0.2              
## [119] gtools_3.8.2                lifecycle_0.2.0            
## [121] nlme_3.1-147                jsonlite_1.7.0             
## [123] fansi_0.4.1                 pillar_1.4.4               
## [125] fastmap_1.0.1               httr_1.4.1                 
## [127] survival_3.1-12             xts_0.12-0                 
## [129] glue_1.4.1                  zip_2.0.4                  
## [131] png_0.1-7                   glmnet_3.0-2               
## [133] bit_1.1-15.2                class_7.3-17               
## [135] stringi_1.4.6               blob_1.2.1                 
## [137] DESeq2_1.24.0               latticeExtra_0.6-29        
## [139] caTools_1.18.0              memoise_1.1.0